Step 4: Put them all together

Create a new Python file and call it speech_analyzer.py

In this exercise, we'll set up a language model (LLM) instance, which could be IBM WatsonxLLM, HuggingFaceHub, or an OpenAI model. Then, we'll establish a prompt template. These templates are structured guides to generate prompts for language models, aiding in output organization (more information is available in the LangChain prompt template documentation).

Next, we'll develop a transcription function that employs the OpenAI Whisper model to convert speech-to-text. This function takes an audio file uploaded through a Gradio app interface (preferably in .mp3 format). The transcribed text is then fed into an LLMChain, which integrates the text with the prompt template and forwards it to the chosen LLM. The final output from the LLM is then displayed in the Gradio app's output textbox.

The output should look like this:

Notice how the LLM corrected a minor mistake made by the speech-to-text model, resulting in a coherent and accurate output.

Exercise: Fill in the missing parts:

  1. import torch
  2. import os
  3. import gradio as gr
  4. #from langchain.llms import OpenAI
  5. from langchain.llms import HuggingFaceHub
  6. from transformers import pipeline
  7. from langchain.prompts import PromptTemplate
  8. from langchain.chains import LLMChain
  9. from ibm_watson_machine_learning.foundation_models.extensions.langchain import WatsonxLLM
  10. from ibm_watson_machine_learning.foundation_models.utils.enums import DecodingMethods
  11. from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
  12. from ibm_watson_machine_learning.foundation_models import Model
  13. #######------------- LLM-------------####
  14. # initiate LLM instance, this can be IBM WatsonX, huggingface, or OpenAI instance
  15. llm = ###---> write your code here
  16. #######------------- Prompt Template-------------####
  17. # This template is structured based on LLAMA2. If you are using other LLMs, feel free to remove the tags
  18. temp = """
  19. <s><<SYS>>
  20. List the key points with details from the context:
  21. [INST] The context : {context} [/INST]
  22. <</SYS>>
  23. """
  24. # here is the simplified version of the prompt template
  25. # temp = """
  26. # List the key points with details from the context:
  27. # The context : {context}
  28. # """
  29. pt = PromptTemplate(
  30. input_variables=["context"],
  31. template= temp)
  32. prompt_to_LLAMA2 = LLMChain(llm=llm, prompt=pt)
  33. #######------------- Speech2text-------------####
  34. def transcript_audio(audio_file):
  35. # Initialize the speech recognition pipeline
  36. pipe = #------> write the code here
  37. # Transcribe the audio file and return the result
  38. transcript_txt = pipe(audio_file, batch_size=8)["text"]
  39. # run the chain to merge transcript text with the template and send it to the LLM
  40. result = prompt_to_LLAMA2.run(transcript_txt)
  41. return result
  42. #######------------- Gradio-------------####
  43. audio_input = gr.Audio(sources="upload", type="filepath")
  44. output_text = gr.Textbox()
  45. # Create the Gradio interface with the function, inputs, and outputs
  46. iface = #---> write code here
  47. iface.launch(server_name="0.0.0.0", server_port=7860)
Click here for the answer
  1. import torch
  2. import os
  3. import gradio as gr
  4. #from langchain.llms import OpenAI
  5. from langchain.llms import HuggingFaceHub
  6. from transformers import pipeline
  7. from langchain.prompts import PromptTemplate
  8. from langchain.chains import LLMChain
  9. from ibm_watson_machine_learning.foundation_models import Model
  10. from ibm_watson_machine_learning.foundation_models.extensions.langchain import WatsonxLLM
  11. from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
  12. my_credentials = {
  13. "url" : "https://us-south.ml.cloud.ibm.com"
  14. }
  15. params = {
  16. GenParams.MAX_NEW_TOKENS: 800, # The maximum number of tokens that the model can generate in a single run.
  17. GenParams.TEMPERATURE: 0.1, # A parameter that controls the randomness of the token generation. A lower value makes the generation more deterministic, while a higher value introduces more randomness.
  18. }
  19. LLAMA2_model = Model(
  20. model_id= 'meta-llama/llama-3-2-11b-vision-instruct',
  21. credentials=my_credentials,
  22. params=params,
  23. project_id="skills-network",
  24. )
  25. llm = WatsonxLLM(LLAMA2_model)
  26. #######------------- Prompt Template-------------####
  27. temp = """
  28. <s><<SYS>>
  29. List the key points with details from the context:
  30. [INST] The context : {context} [/INST]
  31. <</SYS>>
  32. """
  33. pt = PromptTemplate(
  34. input_variables=["context"],
  35. template= temp)
  36. prompt_to_LLAMA2 = LLMChain(llm=llm, prompt=pt)
  37. #######------------- Speech2text-------------####
  38. def transcript_audio(audio_file):
  39. # Initialize the speech recognition pipeline
  40. pipe = pipeline(
  41. "automatic-speech-recognition",
  42. model="openai/whisper-tiny.en",
  43. chunk_length_s=30,
  44. )
  45. # Transcribe the audio file and return the result
  46. transcript_txt = pipe(audio_file, batch_size=8)["text"]
  47. result = prompt_to_LLAMA2.run(transcript_txt)
  48. return result
  49. #######------------- Gradio-------------####
  50. audio_input = gr.Audio(sources="upload", type="filepath")
  51. output_text = gr.Textbox()
  52. iface = gr.Interface(fn= transcript_audio,
  53. inputs= audio_input, outputs= output_text,
  54. title= "Audio Transcription App",
  55. description= "Upload the audio file")
  56. iface.launch(server_name="0.0.0.0", server_port=7860)

Run your code:

    python3 speech_analyzer.py

If there is no error, run the web app: